recordImportTree,
canImportKeys,
ImportResult(..),
+ Imported,
importChanges,
importKeys,
makeImportMatcher,
import Annex.HashObject
import Annex.Transfer
import Annex.CheckIgnore
+import Annex.CatFile
import Annex.VectorClock
import Command
import Backend
:: Remote
-> ImportTreeConfig
-> ImportCommitConfig
- -> ImportableContentsChunkable Annex (Either Sha Key)
+ -> Imported
-> Annex (Maybe Ref)
-buildImportCommit remote importtreeconfig importcommitconfig importable =
+buildImportCommit remote importtreeconfig importcommitconfig imported =
case importCommitTracking importcommitconfig of
Nothing -> go Nothing
Just trackingcommit -> inRepo (Git.Ref.tree trackingcommit) >>= \case
Just _ -> go (Just trackingcommit)
where
go trackingcommit = do
- (imported, updatestate) <- recordImportTree remote importtreeconfig importable
- buildImportCommit' remote importcommitconfig trackingcommit imported >>= \case
+ (importedtree, updatestate) <- recordImportTree remote importtreeconfig imported
+ buildImportCommit' remote importcommitconfig trackingcommit importedtree >>= \case
Just finalcommit -> do
updatestate
return (Just finalcommit)
recordImportTree
:: Remote
-> ImportTreeConfig
- -> ImportableContentsChunkable Annex (Either Sha Key)
+ -> Imported
-> Annex (History Sha, Annex ())
-recordImportTree remote importtreeconfig importable = do
- imported@(History finaltree _) <- buildImportTrees basetree subdir importable
- return (imported, updatestate finaltree)
+recordImportTree remote importtreeconfig imported = do
+ importedtree@(History finaltree _) <- buildImportTrees basetree subdir imported
+ return (importedtree, updatestate finaltree)
where
basetree = case importtreeconfig of
ImportTree -> emptyTree
parents <- mapM (mknewcommits oldhc old) (S.toList hs)
mkcommit parents importedtree
-{- Builds a history of git trees reflecting the ImportableContents.
+{- Builds a history of git trees for an import.
-
- - When a subdir is provided, imported tree is grafted into the basetree at
- - that location, replacing any object that was there.
+ - When a subdir is provided, the imported tree is grafted into
+ - the basetree at that location, replacing any object that was there.
-}
buildImportTrees
:: Ref
-> Maybe TopFilePath
- -> ImportableContentsChunkable Annex (Either Sha Key)
+ -> Imported
-> Annex (History Sha)
-buildImportTrees = buildImportTreesGeneric convertImportTree
+-- A full import: the trees are built from the complete contents.
+buildImportTrees basetree msubdir (ImportedFull imported) =
+ buildImportTreesGeneric convertImportTree basetree msubdir imported
+-- A diff import: the changes are applied on top of the previously
+-- imported tree. When there are no changes, that tree is reused as-is.
+buildImportTrees basetree msubdir (ImportedDiff (LastImportedTree oldtree) imported) = do
+ importtree <- if null (importableContents imported)
+ then pure oldtree
+ else applydiff
+ repo <- Annex.gitRepo
+ t <- withMkTreeHandle repo $
+ graftImportTree basetree msubdir importtree
+ -- Diffing is not currently implemented when the history is not empty.
+ return (History t mempty)
+ where
+ applydiff = do
+ -- Split the diff into removed files and new or changed files.
+ let (removed, new) = partition isremoved
+ (importableContents imported)
+ newtreeitems <- catMaybes <$> mapM mktreeitem new
+ let removedfiles = map (mkloc . fst) removed
+ inRepo $ adjustTree
+ (pure . Just)
+ -- ^ keep files that are not added/removed the same
+ newtreeitems
+ (\_oldti newti -> newti)
+ -- ^ prefer newly added version of file
+ removedfiles
+ oldtree
+
+ -- Only changed entries become tree items. Removed entries do not
+ -- reach this function, since applydiff partitions them out first;
+ -- the DiffRemoved case is only there to keep it total.
+ mktreeitem (loc, DiffChanged v) =
+ Just <$> mkImportTreeItem msubdir loc v
+ mktreeitem (_, DiffRemoved) =
+ pure Nothing
+
+ mkloc = asTopFilePath . fromImportLocation
+
+ isremoved (_, v) = v == DiffRemoved
convertImportTree :: Maybe TopFilePath -> [(ImportLocation, Either Sha Key)] -> Annex Tree
-convertImportTree msubdir ls = treeItemsToTree <$> mapM mktreeitem ls
+convertImportTree msubdir ls =
+ treeItemsToTree <$> mapM (uncurry $ mkImportTreeItem msubdir) ls
+
+-- Makes the TreeItem for a single imported file. A Key becomes a
+-- symlink item, hashing the link that gitAnnexLink calculates for the
+-- file's path (which includes msubdir, when provided). A Sha becomes a
+-- regular file item with that sha. The path recorded in the TreeItem
+-- itself does not include msubdir.
+mkImportTreeItem :: Maybe TopFilePath -> ImportLocation -> Either Sha Key -> Annex TreeItem
+mkImportTreeItem msubdir loc v = case v of
+ Right k -> do
+ relf <- fromRepo $ fromTopFilePath topf
+ symlink <- calcRepo $ gitAnnexLink relf k
+ linksha <- hashSymlink symlink
+ return $ TreeItem treepath (fromTreeItemType TreeSymlink) linksha
+ Left sha ->
+ return $ TreeItem treepath (fromTreeItemType TreeFile) sha
where
- mktreeitem (loc, v) = case v of
- Right k -> do
- relf <- fromRepo $ fromTopFilePath topf
- symlink <- calcRepo $ gitAnnexLink relf k
- linksha <- hashSymlink symlink
- return $ TreeItem treepath (fromTreeItemType TreeSymlink) linksha
- Left sha ->
- return $ TreeItem treepath (fromTreeItemType TreeFile) sha
- where
- lf = fromImportLocation loc
- treepath = asTopFilePath lf
- topf = asTopFilePath $
- maybe lf (\sd -> getTopFilePath sd P.</> lf) msubdir
+ lf = fromImportLocation loc
+ treepath = asTopFilePath lf
+ topf = asTopFilePath $
+ maybe lf (\sd -> getTopFilePath sd P.</> lf) msubdir
{- Builds a history of git trees using ContentIdentifiers.
-
where
ia = Remote.importActions remote
+-- Result of an import. ImportUnfinished indicates that some file failed to
+-- be imported. Running again should resume where it left off.
+data ImportResult t
+ = ImportFinished t
+ | ImportUnfinished
+
+-- An entry in a diff between two imports: either a new or changed
+-- value (DiffChanged), or a file that was removed (DiffRemoved).
+-- Eq is derived so that removals can be detected by comparison.
data Diffed t
= DiffChanged t
| DiffRemoved
+ deriving (Eq)
-{- Diffs between the current and previous ContentIdentifier trees, and
+-- What an import produced. ImportedFull holds the complete contents.
+-- ImportedDiff holds the tree that was imported last time, along with
+-- only the files that were changed or removed since then.
+data Imported
+ = ImportedFull (ImportableContentsChunkable Annex (Either Sha Key))
+ | ImportedDiff LastImportedTree (ImportableContents (Diffed (Either Sha Key)))
+
+-- The Sha of a previously imported tree, as returned by
+-- getLastImportedTree.
+newtype LastImportedTree = LastImportedTree Sha
+
+{- Diffs between the previous and current ContentIdentifier trees, and
- runs importKeys on only the changed files.
-
- This will download the same content as if importKeys were run on all
-> Bool
-> Bool
-> ImportableContentsChunkable Annex (ContentIdentifier, ByteSize)
- -> Annex (ImportResult (Either
- (ImportableContentsChunkable Annex (Either Sha Key))
- (ImportableContentsChunkable Annex (Diffed (Either Sha Key)))))
+ -> Annex (ImportResult Imported)
importChanges remote importtreeconfig importcontent thirdpartypopulated importablecontents = do
((History currcidtree currhistory), cidtreemap) <- buildContentIdentifierTree importablecontents
-- diffimport below does not handle history, so when there is
else do
getContentIdentifierTree (Remote.uuid remote) >>= \case
Nothing -> fullimport currcidtree
- Just prevcidtree -> diffimport cidtreemap prevcidtree currcidtree
+ Just prevcidtree -> candiffimport prevcidtree >>= \case
+ Nothing -> fullimport currcidtree
+ Just lastimportedtree -> diffimport cidtreemap prevcidtree currcidtree lastimportedtree
where
remember = recordContentIdentifierTree (Remote.uuid remote)
+ -- In order to use a diff, the previous ContentIdentifier tree must
+ -- not have been garbage collected, which can happen since there
+ -- are no git refs to it.
+ --
+ -- Also, a tree must have been imported before, and that tree must
+ -- also have not been garbage collected (which is less likely to
+ -- happen due to the remote tracking branch).
+ candiffimport prevcidtree =
+ catObjectMetaData prevcidtree >>= \case
+ Nothing -> return Nothing
+ Just _ -> getLastImportedTree remote >>= \case
+ Nothing -> return Nothing
+ Just lastimported@(LastImportedTree t) ->
+ ifM (isJust <$> catObjectMetaData t)
+ ( return (Just lastimported)
+ , return Nothing
+ )
+
fullimport currcidtree =
importKeys remote importtreeconfig importcontent thirdpartypopulated importablecontents >>= \case
ImportUnfinished -> return ImportUnfinished
ImportFinished r -> do
remember currcidtree
- return $ ImportFinished $ Left r
-
- diffimport cidtreemap prevcidtree currcidtree = do
- (diff, cleanup) <- inRepo $ Git.DiffTree.diffTreeRecursive currcidtree prevcidtree
- let (removed, changed) = partition (\ti -> Git.DiffTree.dstsha ti `elem` nullShas) diff
- let mkloc = mkImportLocation . getTopFilePath . Git.DiffTree.file
+ return $ ImportFinished $ ImportedFull r
+
+ diffimport cidtreemap prevcidtree currcidtree lastimportedtree = do
+ (diff, cleanup) <- inRepo $ Git.DiffTree.diffTreeRecursive
+ prevcidtree
+ currcidtree
+ let (removed, changed) = partition isremoval diff
let mkicchanged ti = do
v <- M.lookup (Git.DiffTree.dstsha ti) cidtreemap
return (mkloc ti, v)
let ic = ImportableContentsComplete $ ImportableContents
- { importableContents = mapMaybe mkicchanged changed
- , importableHistory = []
- }
+ { importableContents = mapMaybe mkicchanged changed
+ , importableHistory = []
+ }
importKeys remote importtreeconfig importcontent thirdpartypopulated ic >>= \case
ImportUnfinished -> do
void $ liftIO cleanup
return ImportUnfinished
- ImportFinished (ImportableContentsComplete ic') -> liftIO cleanup >>= \case
- False -> return ImportUnfinished
- True -> do
- remember currcidtree
- let diffchanged = map
- (\(loc, v) -> (loc, DiffChanged v))
- (importableContents ic')
- let diffremoved = map
- (\ti -> (mkloc ti, DiffRemoved))
- removed
- let ic'' = ImportableContentsComplete $ ImportableContents
- { importableContents = diffremoved ++ diffchanged
- , importableHistory = []
- }
- return $ ImportFinished $ Right ic''
+ ImportFinished (ImportableContentsComplete ic') ->
+ liftIO cleanup >>= \case
+ False -> return ImportUnfinished
+ True -> do
+ remember currcidtree
+ return $ ImportFinished $
+ ImportedDiff lastimportedtree
+ (mkdiff ic' removed)
-- importKeys is not passed ImportableContentsChunked
-- above, so it cannot return it
ImportFinished (ImportableContentsChunked {}) -> error "internal"
+
+ isremoval ti = Git.DiffTree.dstsha ti `elem` nullShas
+
+ mkloc = mkImportLocation . getTopFilePath . Git.DiffTree.file
--- Result of an import. ImportUnfinished indicates that some file failed to
--- be imported. Running again should resume where it left off.
-data ImportResult t
- = ImportFinished t
- | ImportUnfinished
+ mkdiff ic removed = ImportableContents
+ { importableContents = diffremoved ++ diffchanged
+ , importableHistory = []
+ }
+ where
+ diffchanged = map
+ (\(loc, v) -> (loc, DiffChanged v))
+ (importableContents ic)
+ diffremoved = map
+ (\ti -> (mkloc ti, DiffRemoved))
+ removed
+
+{- Gets the tree that was last imported from the remote
+ - (or exported to it if an export happened after the last import).
+ -
+ - The tree is looked up in the export database for the remote's UUID,
+ - which is opened for the lookup and closed again afterwards.
+ - Returns Nothing when Export.getExportTreeCurrent finds no tree.
+ -}
+getLastImportedTree :: Remote -> Annex (Maybe LastImportedTree)
+getLastImportedTree remote = do
+ db <- Export.openDb (Remote.uuid remote)
+ mtree <- liftIO $ Export.getExportTreeCurrent db
+ Export.closeDb db
+ return (LastImportedTree <$> mtree)
{- Downloads all new ContentIdentifiers, or when importcontent is False,
- generates Keys without downloading.
Still, it would be good to find some ways to speed it up.
+In particular, speeding up repeated imports from the same special remote,
+when only a few files have changed, would make it much more useful. It's ok
+to pay a somewhat expensive price to import a lot of new files, if updates
+are quick after that.
+
---
+A major thing that makes it slow, when a remote contains
+many files, is converting from ContentIdentifiers to Keys.
+It does a cidsdb lookup for every file, before it knows if the file has
+changed or not, which gets slow with a lot of files.
+
What if it generated a git tree, where each file in the tree is
a sha1 hash of the ContentIdentifier. The tree can just be recorded locally
somewhere. It's ok if it gets garbage collected; it's only an optimisation.
On the next sync, diff from the old to the new tree. It only needs to
-import the changed files!
+import the changed files, and can avoid the cidsdb lookup for the
+unchanged files!
(That is assuming that ContentIdentifiers don't tend to sha1 collide.
If there was a collision it would fail to import the new file. But it seems
are no more likely to collide than the content of files, and probably less
likely overall..)
-How fast can a git tree of say, 10000 files be generated? Is it faster than
-querying sqlite 10000 times?
-
-Once it knows which files are changed, it still needs to generate the
-imported tree, which contains both changed and unchanged files. How to
-handle unchanged files when generating that tree? Current method is
-to do a database lookup to convert the ContentIdentifier into a Key, and
-record that in the tree. But those database lookups are the slow thing that
-needs to be avoided. Seems like it will need to either use adjustTree, or a
-separate index file. (The index file would make importing a History hard.)
-
-----
-
-Another idea would to be use something faster than sqlite to record the cid
-to key mappings. Looking up those mappings is the main thing that makes
-import slow when only a few files have changed and a large number have not.
-
---[[Joey]]
+> I implemented this optimisation. Importing from a special remote that
+> has 10000 files, that have all been imported before, and 1 new file
+> sped up from 26.06 to 2.59 seconds. An import with no changes sped
+> up from 24.3 to 1.99 seconds. Going up to 20000 files, an import with
+> no changes sped up from 125.95 to 3.84 seconds.
+> (All measured with warm cache.)
+
+> (Note that I have only implemented this optimisation for imports that
+> do not include History. So importing from versioned S3 buckets will
+> still be slow. It would be possible to do a similar optimisation for
+> History, but it seemed complicated so I punted.) --[[Joey]]